{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.2-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python39164bit3e81fb3f53ff41368dd31efd247a2ba2",
   "display_name": "Python 3.9.1 64-bit"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "source": [
    "## 数据抓取"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "from lxml import etree\n",
    "import re\n",
    "import urllib\n",
    "import os\n",
    "import zipfile\n",
    "import pandas as pd\n",
    "from sqlalchemy import create_engine\n",
    "import threading as thi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "20 files in page 1 have been downloaded successfully\n",
      "All files in page 1 have been downloaded successfully\n",
      "20 files have been downloaded totally\n",
      "20 files in page 2 have been downloaded successfully\n",
      "All files in page 2 have been downloaded successfully\n",
      "40 files have been downloaded totally\n",
      "This URL couldn't be downloaded:  http://ecp.sgcc.com.cn/ecp1.0/html/project/014001001/9900000000000074336.html\n",
      "19 files in page 3 have been downloaded successfully\n",
      "All files in page 3 have been downloaded successfully\n",
      "59 files have been downloaded totally\n",
      "This URL couldn't be downloaded:  http://ecp.sgcc.com.cn/ecp1.0/html/project/014001001/9900000000000074216.html\n",
      "19 files in page 4 have been downloaded successfully\n",
      "All files in page 4 have been downloaded successfully\n",
      "78 files have been downloaded totally\n",
      "20 files in page 5 have been downloaded successfully\n",
      "All files in page 5 have been downloaded successfully\n",
      "98 files have been downloaded totally\n",
      "20 files in page 6 have been downloaded successfully\n",
      "All files in page 6 have been downloaded successfully\n",
      "118 files have been downloaded totally\n",
      "20 files in page 7 have been downloaded successfully\n",
      "All files in page 7 have been downloaded successfully\n",
      "138 files have been downloaded totally\n",
      "20 files in page 8 have been downloaded successfully\n",
      "All files in page 8 have been downloaded successfully\n",
      "158 files have been downloaded totally\n",
      "20 files in page 9 have been downloaded successfully\n",
      "All files in page 9 have been downloaded successfully\n",
      "178 files have been downloaded totally\n",
      "This URL couldn't be downloaded:  http://ecp.sgcc.com.cn/ecp1.0/html/project/014001001/9900000000006300183.html\n",
      "19 files in page 10 have been downloaded successfully\n",
      "All files in page 10 have been downloaded successfully\n",
      "197 files have been downloaded totally\n",
      "20 files in page 11 have been downloaded successfully\n",
      "All files in page 11 have been downloaded successfully\n",
      "217 files have been downloaded totally\n",
      "20 files in page 12 have been downloaded successfully\n",
      "All files in page 12 have been downloaded successfully\n",
      "237 files have been downloaded totally\n",
      "20 files in page 13 have been downloaded successfully\n",
      "All files in page 13 have been downloaded successfully\n",
      "257 files have been downloaded totally\n",
      "20 files in page 14 have been downloaded successfully\n",
      "All files in page 14 have been downloaded successfully\n",
      "277 files have been downloaded totally\n",
      "20 files in page 15 have been downloaded successfully\n",
      "All files in page 15 have been downloaded successfully\n",
      "297 files have been downloaded totally\n",
      "20 files in page 16 have been downloaded successfully\n",
      "All files in page 16 have been downloaded successfully\n",
      "317 files have been downloaded totally\n",
      "20 files in page 17 have been downloaded successfully\n",
      "All files in page 17 have been downloaded successfully\n",
      "337 files have been downloaded totally\n",
      "20 files in page 18 have been downloaded successfully\n",
      "All files in page 18 have been downloaded successfully\n",
      "357 files have been downloaded totally\n",
      "20 files in page 19 have been downloaded successfully\n",
      "All files in page 19 have been downloaded successfully\n",
      "377 files have been downloaded totally\n",
      "This URL couldn't be downloaded:  http://ecp.sgcc.com.cn/ecp1.0/html/project/014001001/9900000000000071350.html\n",
      "19 files in page 20 have been downloaded successfully\n",
      "All files in page 20 have been downloaded successfully\n",
      "396 files have been downloaded totally\n",
      "20 files in page 21 have been downloaded successfully\n",
      "All files in page 21 have been downloaded successfully\n",
      "416 files have been downloaded totally\n",
      "20 files in page 22 have been downloaded successfully\n",
      "All files in page 22 have been downloaded successfully\n",
      "436 files have been downloaded totally\n",
      "20 files in page 23 have been downloaded successfully\n",
      "All files in page 23 have been downloaded successfully\n",
      "456 files have been downloaded totally\n",
      "20 files in page 24 have been downloaded successfully\n",
      "All files in page 24 have been downloaded successfully\n",
      "476 files have been downloaded totally\n",
      "20 files in page 25 have been downloaded successfully\n",
      "All files in page 25 have been downloaded successfully\n",
      "496 files have been downloaded totally\n",
      "20 files in page 26 have been downloaded successfully\n",
      "All files in page 26 have been downloaded successfully\n",
      "516 files have been downloaded totally\n",
      "20 files in page 27 have been downloaded successfully\n",
      "All files in page 27 have been downloaded successfully\n",
      "536 files have been downloaded totally\n",
      "20 files in page 28 have been downloaded successfully\n",
      "All files in page 28 have been downloaded successfully\n",
      "556 files have been downloaded totally\n",
      "20 files in page 29 have been downloaded successfully\n",
      "All files in page 29 have been downloaded successfully\n",
      "576 files have been downloaded totally\n",
      "20 files in page 30 have been downloaded successfully\n",
      "All files in page 30 have been downloaded successfully\n",
      "596 files have been downloaded totally\n",
      "20 files in page 31 have been downloaded successfully\n",
      "All files in page 31 have been downloaded successfully\n",
      "616 files have been downloaded totally\n",
      "20 files in page 32 have been downloaded successfully\n",
      "All files in page 32 have been downloaded successfully\n",
      "636 files have been downloaded totally\n",
      "20 files in page 33 have been downloaded successfully\n",
      "All files in page 33 have been downloaded successfully\n",
      "656 files have been downloaded totally\n",
      "20 files in page 34 have been downloaded successfully\n",
      "All files in page 34 have been downloaded successfully\n",
      "676 files have been downloaded totally\n",
      "20 files in page 35 have been downloaded successfully\n",
      "All files in page 35 have been downloaded successfully\n",
      "696 files have been downloaded totally\n",
      "20 files in page 36 have been downloaded successfully\n",
      "All files in page 36 have been downloaded successfully\n",
      "716 files have been downloaded totally\n",
      "20 files in page 37 have been downloaded successfully\n",
      "All files in page 37 have been downloaded successfully\n",
      "736 files have been downloaded totally\n",
      "20 files in page 38 have been downloaded successfully\n",
      "All files in page 38 have been downloaded successfully\n",
      "756 files have been downloaded totally\n",
      "20 files in page 39 have been downloaded successfully\n",
      "All files in page 39 have been downloaded successfully\n",
      "776 files have been downloaded totally\n",
      "20 files in page 40 have been downloaded successfully\n",
      "All files in page 40 have been downloaded successfully\n",
      "796 files have been downloaded totally\n",
      "20 files in page 41 have been downloaded successfully\n",
      "All files in page 41 have been downloaded successfully\n",
      "816 files have been downloaded totally\n",
      "20 files in page 42 have been downloaded successfully\n",
      "All files in page 42 have been downloaded successfully\n",
      "836 files have been downloaded totally\n",
      "20 files in page 43 have been downloaded successfully\n",
      "All files in page 43 have been downloaded successfully\n",
      "856 files have been downloaded totally\n",
      "20 files in page 44 have been downloaded successfully\n",
      "All files in page 44 have been downloaded successfully\n",
      "876 files have been downloaded totally\n",
      "20 files in page 45 have been downloaded successfully\n",
      "All files in page 45 have been downloaded successfully\n",
      "896 files have been downloaded totally\n",
      "20 files in page 46 have been downloaded successfully\n",
      "All files in page 46 have been downloaded successfully\n",
      "916 files have been downloaded totally\n",
      "20 files in page 47 have been downloaded successfully\n",
      "All files in page 47 have been downloaded successfully\n",
      "936 files have been downloaded totally\n",
      "20 files in page 48 have been downloaded successfully\n",
      "All files in page 48 have been downloaded successfully\n",
      "956 files have been downloaded totally\n",
      "20 files in page 49 have been downloaded successfully\n",
      "All files in page 49 have been downloaded successfully\n",
      "976 files have been downloaded totally\n",
      "20 files in page 50 have been downloaded successfully\n",
      "All files in page 50 have been downloaded successfully\n",
      "996 files have been downloaded totally\n",
      "20 files in page 51 have been downloaded successfully\n",
      "All files in page 51 have been downloaded successfully\n",
      "1016 files have been downloaded totally\n",
      "20 files in page 52 have been downloaded successfully\n",
      "All files in page 52 have been downloaded successfully\n",
      "1036 files have been downloaded totally\n",
      "Oops! Some problems occured in page 53\n",
      "19 files in page 53 have been downloaded successfully\n",
      "All files in page 53 have been downloaded successfully\n",
      "1055 files have been downloaded totally\n",
      "This URL couldn't be downloaded:  http://ecp.sgcc.com.cn/ecp1.0/html/project/014001001/9900000000000065779.html\n",
      "19 files in page 54 have been downloaded successfully\n",
      "All files in page 54 have been downloaded successfully\n",
      "1074 files have been downloaded totally\n",
      "20 files in page 55 have been downloaded successfully\n",
      "All files in page 55 have been downloaded successfully\n",
      "1094 files have been downloaded totally\n",
      "20 files in page 56 have been downloaded successfully\n",
      "All files in page 56 have been downloaded successfully\n",
      "1114 files have been downloaded totally\n",
      "20 files in page 57 have been downloaded successfully\n",
      "All files in page 57 have been downloaded successfully\n",
      "1134 files have been downloaded totally\n",
      "20 files in page 58 have been downloaded successfully\n",
      "All files in page 58 have been downloaded successfully\n",
      "1154 files have been downloaded totally\n",
      "20 files in page 59 have been downloaded successfully\n",
      "All files in page 59 have been downloaded successfully\n",
      "1174 files have been downloaded totally\n",
      "20 files in page 60 have been downloaded successfully\n",
      "All files in page 60 have been downloaded successfully\n",
      "1194 files have been downloaded totally\n",
      "20 files in page 61 have been downloaded successfully\n",
      "All files in page 61 have been downloaded successfully\n",
      "1214 files have been downloaded totally\n",
      "20 files in page 62 have been downloaded successfully\n",
      "All files in page 62 have been downloaded successfully\n",
      "1234 files have been downloaded totally\n",
      "20 files in page 63 have been downloaded successfully\n",
      "All files in page 63 have been downloaded successfully\n",
      "1254 files have been downloaded totally\n",
      "20 files in page 64 have been downloaded successfully\n",
      "All files in page 64 have been downloaded successfully\n",
      "1274 files have been downloaded totally\n",
      "20 files in page 65 have been downloaded successfully\n",
      "All files in page 65 have been downloaded successfully\n",
      "1294 files have been downloaded totally\n",
      "20 files in page 66 have been downloaded successfully\n",
      "All files in page 66 have been downloaded successfully\n",
      "1314 files have been downloaded totally\n",
      "20 files in page 67 have been downloaded successfully\n",
      "All files in page 67 have been downloaded successfully\n",
      "1334 files have been downloaded totally\n",
      "20 files in page 68 have been downloaded successfully\n",
      "All files in page 68 have been downloaded successfully\n",
      "1354 files have been downloaded totally\n",
      "20 files in page 69 have been downloaded successfully\n",
      "All files in page 69 have been downloaded successfully\n",
      "1374 files have been downloaded totally\n",
      "20 files in page 70 have been downloaded successfully\n",
      "All files in page 70 have been downloaded successfully\n",
      "1394 files have been downloaded totally\n",
      "20 files in page 71 have been downloaded successfully\n",
      "All files in page 71 have been downloaded successfully\n",
      "1414 files have been downloaded totally\n",
      "20 files in page 72 have been downloaded successfully\n",
      "All files in page 72 have been downloaded successfully\n",
      "1434 files have been downloaded totally\n",
      "20 files in page 73 have been downloaded successfully\n",
      "All files in page 73 have been downloaded successfully\n",
      "1454 files have been downloaded totally\n",
      "20 files in page 74 have been downloaded successfully\n",
      "All files in page 74 have been downloaded successfully\n",
      "1474 files have been downloaded totally\n",
      "20 files in page 75 have been downloaded successfully\n",
      "All files in page 75 have been downloaded successfully\n",
      "1494 files have been downloaded totally\n",
      "20 files in page 76 have been downloaded successfully\n",
      "All files in page 76 have been downloaded successfully\n",
      "1514 files have been downloaded totally\n",
      "20 files in page 77 have been downloaded successfully\n",
      "All files in page 77 have been downloaded successfully\n",
      "1534 files have been downloaded totally\n",
      "20 files in page 78 have been downloaded successfully\n",
      "All files in page 78 have been downloaded successfully\n",
      "1554 files have been downloaded totally\n",
      "20 files in page 79 have been downloaded successfully\n",
      "All files in page 79 have been downloaded successfully\n",
      "1574 files have been downloaded totally\n",
      "20 files in page 80 have been downloaded successfully\n",
      "All files in page 80 have been downloaded successfully\n",
      "1594 files have been downloaded totally\n",
      "This URL couldn't be downloaded:  http://ecp.sgcc.com.cn/ecp1.0/html/project/014001001/9900000000000060225.html\n",
      "This URL couldn't be downloaded:  http://ecp.sgcc.com.cn/ecp1.0/html/project/014001001/9900000000000060224.html\n",
      "18 files in page 81 have been downloaded successfully\n",
      "All files in page 81 have been downloaded successfully\n",
      "1612 files have been downloaded totally\n"
     ]
    }
   ],
   "source": [
    "# 创建文件夹以存放招标文件\n",
    "os.mkdir('./Projectzipfiles')\n",
    "\n",
    "# 监控总下载数量\n",
    "download_success = 0\n",
    "download_failure = 0\n",
    "\n",
    "for i in range(1,3):\n",
    "    url = 'http://ecp.sgcc.com.cn/ecp1.0/project_list.jsp?site=global&column_code=014001001&project_type=1&company_id=&status=&project_name=&pageNo=%d'%i\n",
    "    header = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'}\n",
    "\n",
    "    # 国家电网不需要担心反爬，所以headers可加可不加\n",
    "    reponse = requests.get(url = url, headers = header) \n",
    "    # 查看返回数据\n",
    "    # reponse.text\n",
    "\n",
    "    '''\n",
    "    http://ecp.sgcc.com.cn/ecp1.0/html/project/014001001/9900000000000075434.html\n",
    "    http://ecp.sgcc.com.cn/ecp1.0/html/project/014001001/9900000000000075383.html\n",
    "    http://ecp.sgcc.com.cn/ecp1.0/html/project/014001001/9900000000000075306.html\n",
    "    http://ecp.sgcc.com.cn/ecp1.0/html/project/014001001/9900000000000075325.html\n",
    "    # 从以上url可以看出，之后最后几位数字不一样，在网页查找中可以看见这些数字属于“a/onclick/showProjectDetail\"\n",
    "    '''\n",
    "\n",
    "    # 用xpath进行提取\n",
    "    xpathtree = etree.HTML(reponse.text)\n",
    "    res = xpathtree.xpath('//tr[@align=\"left\"]/td[@class=\"black40\"]/a')\n",
    "    # print(res, len(res))\n",
    "\n",
    "    # 监控每页下载数量\n",
    "    per_page_succ = 0\n",
    "\n",
    "    # 对res对象进行遍历提取onclick属性\n",
    "    for resi in res:\n",
    "        try:\n",
    "            detail = resi.xpath('./@onclick')[0]\n",
    "            # print(detail)\n",
    "            # 再利用正则表达式进行最后几位数的提取\n",
    "            patterns = 'showProjectDetail\\((.*?),\\'(.*?)\\'\\);'\n",
    "            # 正则表达式中的\"\\\"表示转义，将作为匹配表达式的部分识别的部分转化为纯字符串\n",
    "            # 正则表达式中有两个匹配组，group的作用就是选择所需要的组\n",
    "            num1 = re.search(patterns, detail).group(1)\n",
    "            num2 = re.search(patterns, detail).group(2)\n",
    "            # 正则表达式中的\\'(.*?)\\'已经用转义符将引号去掉了\n",
    "            # print(num2)\n",
    "\n",
    "            # 二级页面提取\n",
    "            url2 = 'http://ecp.sgcc.com.cn/ecp1.0/html/project/014001001/{}.html'.format(num2)\n",
    "            # 或者这样也可：url2 = 'http://ecp.sgcc.com.cn/ecp1.0/html/project/014001001/%s.html'%num2\n",
    "            # print(url2)\n",
    "\n",
    "            reponse2 = requests.get(url = url2, headers = header) \n",
    "            # print(reponse2.text)\n",
    "            xpathtree2 = etree.HTML(reponse2.text)\n",
    "            res2 = xpathtree2.xpath('//tr/td/text()')[1]\n",
    "            \n",
    "            # 项目招标状态\n",
    "            proj_status = re.search('(\\S+.*?)', res2).group(1)\n",
    "            # print(proj_status)\n",
    "\n",
    "            # 项目名称\n",
    "            proj_name = etree.HTML(reponse2.text).xpath('//tr/td[2]/text()')[2]\n",
    "            # print(proj_name)\n",
    "            proj_name = proj_status + '-' + proj_name # 为每个项目对应上招标状态\n",
    "\n",
    "            # 项目招标文件下载\n",
    "            install_part_url = etree.HTML(reponse2.text).xpath('//tr/td/a/@href')[0] # 该步骤获取的url只是下载链接的后半部分\n",
    "            # print(install_part_url)\n",
    "            original_part = 'http://ecp.sgcc.com.cn'\n",
    "            install_url = original_part + install_part_url # 拼接获得完整下载链接\n",
    "            # print(install_url)\n",
    "        except:\n",
    "            print('Oops! Some problems occured in page %d'%i)\n",
    "            continue\n",
    "\n",
    "        # 使用urllib进行下载\n",
    "        # filename如果不变的话，则每一次下载都会默认覆盖前一个下载，所以此处用前面得到的proj_name（20个）作为文件名\n",
    "        try:\n",
    "            urllib.request.urlretrieve(url=install_url, filename='./Projectzipfiles/%s.zip'%proj_name)\n",
    "            download_success += 1\n",
    "            per_page_succ += 1\n",
    "        except:\n",
    "            download_failure += 1\n",
    "            print(\"This URL couldn't be downloaded: \",url2)\n",
    "            continue # 即使下载失败则直接跳过，继续后面的下载\n",
    "    print('%s files in page %s have been downloaded successfully'%(per_page_succ, i))\n",
    "    print('All files in page %d have been downloaded successfully'%i)\n",
    "    print('%s files have been downloaded totally'%download_success)\n",
    "\n",
    "print('Download finished! Files downloaded successfully: ', download_success)\n",
    "print('Download finished! Fail to download: ', download_failure)\n",
    "\n",
    "\n"
   ]
  },
  {
   "source": [
    "## 文件批量解压"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Fail to uncompress this file: 已经截标-招标公告-国网信息通信产业集团有限公司2020年第四批集中采购项目公开招标（物资）.zip\n",
      "Fail to uncompress this file: 已经截标-招标公告-国网信息通信产业集团有限公司2020年第四批集中采购项目公开招标（物资）.zip\n",
      "Fail to uncompress this file: 已经截标-招标公告-国网信息通信产业集团有限公司2020年第四批集中采购项目公开招标（物资）.zip\n",
      "Fail to uncompress this file: 已经截标-招标公告-国网信息通信产业集团有限公司2020年第四批集中采购项目公开招标（物资）.zip\n",
      "Fail to uncompress this file: 已经截标-国家电网有限公司信息通信分公司2020年第二批物资类公开招标项目-招标公告.zip\n",
      "Fail to uncompress this file: 已经截标-国家电网有限公司信息通信分公司2020年第二批物资类公开招标项目-招标公告.zip\n",
      "Fail to uncompress this file: 已经截标-国家电网有限公司信息通信分公司2020年第二批物资类公开招标项目-招标公告.zip\n",
      "Fail to uncompress this file: 已经截标-国家电网有限公司信息通信分公司2020年第二批物资类公开招标项目-招标公告.zip\n",
      "Fail to uncompress this file: 已经截标-中国电力财务有限公司2020年第六批次集中采购（五批流标项目）.zip\n",
      "Fail to uncompress this file: 已经截标-中国电力财务有限公司2020年第六批次集中采购（五批流标项目）.zip\n",
      "Fail to uncompress this file: 已经截标-中国电力财务有限公司2020年第六批次集中采购（五批流标项目）.zip\n",
      "Fail to uncompress this file: 已经截标-国网信息通信产业集团有限公司2020年第三批集中采购项目公开招标（物资）.zip\n",
      "Fail to uncompress this file: 已经截标-国网信息通信产业集团有限公司2020年第三批集中采购项目公开招标（物资）.zip\n",
      "Fail to uncompress this file: 已经截标-国网信息通信产业集团有限公司2020年第三批集中采购项目公开招标（物资）.zip\n",
      "Fail to uncompress this file: 已经截标-国网信息通信产业集团有限公司2020年第三批集中采购项目公开招标（物资）.zip\n",
      "Fail to uncompress this file: 已经截标-国网信息通信产业集团有限公司2020年第三批集中采购项目公开招标（物资）.zip\n",
      "Fail to uncompress this file: 已经截标-国家电网有限公司信息通信分公司2020年第三批物资类公开招标项目-招标公告.zip\n",
      "Fail to uncompress this file: 已经截标-国家电网有限公司信息通信分公司2020年第三批物资类公开招标项目-招标公告.zip\n",
      "Fail to uncompress this file: 已经截标-国家电网有限公司信息通信分公司2020年第三批物资类公开招标项目-招标公告.zip\n",
      "Fail to uncompress this file: 已经截标-国家电网有限公司信息通信分公司2020年第三批物资类公开招标项目-招标公告.zip\n",
      "40 files uncompressed successfully!\n",
      "Fail to uncompress 20 files!\n"
     ]
    }
   ],
   "source": [
    "# zipfile.ZipFile('压缩文件名', '模式')\n",
    "try:\n",
    "    os.mkdir('./Project-files') # 存放解压后的文件\n",
    "except:\n",
    "    pass\n",
    "\n",
    "# 监控解压进度\n",
    "uncompress_success = 0\n",
    "uncompress_failure = 0\n",
    "\n",
    "for zips in os.listdir('./Projectzipfiles'):\n",
    "    # print(zips)\n",
    "    if zips is not zips.startswith('.'):\n",
    "        file_dir = re.search('(.*?).zip', zips).group(1) # 只取出文件名，不要后缀\n",
    "        # print(file_dir)\n",
    "\n",
    "        filecontents = zipfile.ZipFile('./Projectzipfiles/%s'%zips,'r') # 读取压缩文件\n",
    "        for file in filecontents.namelist():\n",
    "            try:\n",
    "                # print(file)\n",
    "                files_name = file.encode('cp437').decode('gbk') # 防止乱码\n",
    "                # print(files_name)\n",
    "                filecontents.extract(file, './Project-files/%s'%file_dir)\n",
    "                file = './Project-files/%s/%s'%(file_dir,file)\n",
    "                files_name = './Project-files/%s/%s'%(file_dir, files_name)\n",
    "                os.rename(file, files_name)\n",
    "            except:\n",
    "                uncompress_failure += 1\n",
    "                print('Fail to uncompress this file: %s'%zips)\n",
    "                continue\n",
    "        uncompress_success += 1\n",
    "    \n",
    "print('%s files uncompressed successfully!'%uncompress_success)\n",
    "print('Fail to uncompress %s files!'%uncompress_failure)\n",
    "                "
   ]
  },
  {
   "source": [
    "## 项目状态批量写入"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Successfully write 40 files\nFail to write 0 files\n"
     ]
    }
   ],
   "source": [
    "listdir = []\n",
    "# 排除隐藏文件\n",
    "for hidden in os.listdir('./Project-files'):\n",
    "    if not hidden.startswith('.'):\n",
    "        listdir.append(hidden)\n",
    "# listdir\n",
    "\n",
    "# 监控写入进度\n",
    "written_success = 0\n",
    "written_failure = 0\n",
    "\n",
    "for char in listdir:\n",
    "    try:\n",
    "        status = re.match('(\\w{4})', char).group()\n",
    "        status_written = open('./Project-files/%s/status.txt'%char, 'w') # 向各个项目文件夹中以文本形式写入状态\n",
    "        status_written.write(status)\n",
    "        written_success += 1\n",
    "    except:\n",
    "        written_failure += 1\n",
    "        print(\"This file couldn't be written: %s\"%char)\n",
    "        continue\n",
    "\n",
    "print('Successfully write %s files'%written_success)\n",
    "print('Fail to write %s files'%written_failure)"
   ]
  },
  {
   "source": [
    "## 货物清单合并、添加状态"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "listdir_sum = []\n",
    "# 排除隐藏文件\n",
    "for hidden in os.listdir('./Project-files'):\n",
    "    if not hidden.startswith('.'):\n",
    "        listdir_sum.append(hidden)\n",
    "\n",
    "proj_dataframe = []\n",
    "for i in listdir_sum:\n",
    "    i = os.path.join('./Project-files', i) # 将文件名（字符串）与目录拼接，获得相对路径\n",
    "    for j in os.listdir(i):\n",
    "        # print(j)\n",
    "        if re.search('[货物清单].*?.xls', j): # 判断文件中有无货物清单字样的excel文件，有则进行group\n",
    "            if not j.startswith('.'):\n",
    "                products_excel = re.search('[货物清单].*?.xls', j).group()\n",
    "                # print(products_excel)\n",
    "                excel_dir = os.path.join(i, products_excel) # 路径拼接，获得excel文件的准确路径\n",
    "                #print(excel_dir)\n",
    "                # 将清单数据用pandas导入(其中一个而已)\n",
    "                #prod_data = pd.read_excel(excel_dir)\n",
    "                \n",
    "                # 因每个Excel可能有多个sheet，所以不用read进行读取，需要对每个sheet进行遍历，如下\n",
    "                sheetall = pd.ExcelFile(excel_dir)\n",
    "\n",
    "                prostatus = open('%s/status.txt'%i, 'r') # 读取之前写入的项目状态文本\n",
    "                sta = prostatus.read()\n",
    "                # print(sta)\n",
    "                # print(sheetall.sheet_names)\n",
    "                for sheetnames in sheetall.sheet_names:\n",
    "                    # print(sheetnames)\n",
    "                    sh_df = sheetall.parse(sheet_name=sheetnames, skiprows = 1) # 读取进来均为dataframe格式，这一行的数据是每一个sheet的内容\n",
    "                    sh_df['项目状态'] = sta\n",
    "                    proj_dataframe.append(sh_df)\n",
    "                # print(len(proj_dataframe))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "     包号      网省采购申请行号        项目单位                    需求单位  \\\n",
       "0   包01  3.005138e+14     四川省电力公司       国网四川省电力公司天府新区供电公司   \n",
       "1   包01  3.005138e+14     四川省电力公司       国网四川省电力公司天府新区供电公司   \n",
       "2   包01  3.005126e+14     四川省电力公司               国网四川映秀湾电厂   \n",
       "3   包01  3.005145e+14     四川省电力公司         国网四川省电力公司甘孜供电公司   \n",
       "4   包01  3.005142e+14     四川省电力公司         国网四川省电力公司巴中供电公司   \n",
       "..  ...           ...         ...                     ...   \n",
       "1    包1  2.600015e+14  国网新源控股有限公司  国网新源控股有限公司安徽绩溪抽水蓄能有限公司   \n",
       "2    包1  2.600015e+14  国网新源控股有限公司  国网新源控股有限公司安徽绩溪抽水蓄能有限公司   \n",
       "3    包1  2.600015e+14  国网新源控股有限公司  国网新源控股有限公司安徽绩溪抽水蓄能有限公司   \n",
       "4    包1  2.600015e+14  国网新源控股有限公司  国网新源控股有限公司安徽绩溪抽水蓄能有限公司   \n",
       "5    包1  2.600015e+14  国网新源控股有限公司  国网新源控股有限公司安徽绩溪抽水蓄能有限公司   \n",
       "\n",
       "                          项目名称 工程电压等级                   物资名称  \\\n",
       "0   虚拟项目—国网四川电力2020年度省公司协议库存招标      无           布电线,BV,铜,4,1   \n",
       "1   虚拟项目—国网四川电力2020年度省公司协议库存招标      无          布电线,BV,铜,35,1   \n",
       "2   虚拟项目—国网四川电力2020年度省公司协议库存招标      无           布电线,BV,铜,4,1   \n",
       "3   虚拟项目—国网四川电力2020年度省公司协议库存招标      无          布电线,BV,铜,35,1   \n",
       "4   虚拟项目—国网四川电力2020年度省公司协议库存招标      无           布电线,BV,铜,4,1   \n",
       "..                         ...    ...                    ...   \n",
       "1                       绩溪基建项目      无      精密空调,双模,40kW,风冷,下   \n",
       "2                       绩溪基建项目      无                  防静电地板   \n",
       "3                       绩溪基建项目      无     UPS电源（不间断电源）,40kVA   \n",
       "4                       绩溪基建项目      无  机柜,600mm,2000mm,600mm   \n",
       "5                       绩溪基建项目      无               机房环境监控系统   \n",
       "\n",
       "                                            物资描述   单位        数量  ... 5.195  \\\n",
       "0                  布电线-型号:BV,导体材质:铜,截面mm2:4,芯数:1    米    568.00  ...   NaN   \n",
       "1                 布电线-型号:BV,导体材质:铜,截面mm2:35,芯数:1    米   4536.00  ...   NaN   \n",
       "2                  布电线-型号:BV,导体材质:铜,截面mm2:4,芯数:1    米     73.68  ...   NaN   \n",
       "3                 布电线-型号:BV,导体材质:铜,截面mm2:35,芯数:1    米  30000.00  ...   NaN   \n",
       "4                  布电线-型号:BV,导体材质:铜,截面mm2:4,芯数:1    米   3000.00  ...   NaN   \n",
       "..                                           ...  ...       ...  ...   ...   \n",
       "1   精密空调-组合方式:双模,额定制冷量Kw:40kW,机组冷却方式:风冷,送/回风方式:下    台      1.00  ...   NaN   \n",
       "2                                          防静电地板  平方米     70.00  ...   NaN   \n",
       "3                        UPS电源（不间断电源）-额定容量:40kVA    套      1.00  ...   NaN   \n",
       "4           机柜-宽度mm:600mm,高度mm:2000mm,深度mm:600mm    套     10.00  ...   NaN   \n",
       "5                                       机房环境监控系统    套      1.00  ...   NaN   \n",
       "\n",
       "   合同签订后30日  ZXZB- SGS- WZ20LX- NY-002 风电机组及其附属设备 风电机组及其附属设备.1 风电机组设备  \\\n",
       "0       NaN                        NaN        NaN          NaN    NaN   \n",
       "1       NaN                        NaN        NaN          NaN    NaN   \n",
       "2       NaN                        NaN        NaN          NaN    NaN   \n",
       "3       NaN                        NaN        NaN          NaN    NaN   \n",
       "4       NaN                        NaN        NaN          NaN    NaN   \n",
       "..      ...                        ...        ...          ...    ...   \n",
       "1       NaN                        NaN        NaN          NaN    NaN   \n",
       "2       NaN                        NaN        NaN          NaN    NaN   \n",
       "3       NaN                        NaN        NaN          NaN    NaN   \n",
       "4       NaN                        NaN        NaN          NaN    NaN   \n",
       "5       NaN                        NaN        NaN          NaN    NaN   \n",
       "\n",
       "   所投机型（单机容量为2000kW及以上），总容量不超过6000kW 6000   kW 合同签订之日起3个月内完成供货  \n",
       "0                                NaN  NaN  NaN             NaN  \n",
       "1                                NaN  NaN  NaN             NaN  \n",
       "2                                NaN  NaN  NaN             NaN  \n",
       "3                                NaN  NaN  NaN             NaN  \n",
       "4                                NaN  NaN  NaN             NaN  \n",
       "..                               ...  ...  ...             ...  \n",
       "1                                NaN  NaN  NaN             NaN  \n",
       "2                                NaN  NaN  NaN             NaN  \n",
       "3                                NaN  NaN  NaN             NaN  \n",
       "4                                NaN  NaN  NaN             NaN  \n",
       "5                                NaN  NaN  NaN             NaN  \n",
       "\n",
       "[30950 rows x 54 columns]"
      ],
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>包号</th>\n      <th>网省采购申请行号</th>\n      <th>项目单位</th>\n      <th>需求单位</th>\n      <th>项目名称</th>\n      <th>工程电压等级</th>\n      <th>物资名称</th>\n      <th>物资描述</th>\n      <th>单位</th>\n      <th>数量</th>\n      <th>...</th>\n      <th>5.195</th>\n      <th>合同签订后30日</th>\n      <th>ZXZB- SGS- WZ20LX- NY-002</th>\n      <th>风电机组及其附属设备</th>\n      <th>风电机组及其附属设备.1</th>\n      <th>风电机组设备</th>\n      <th>所投机型（单机容量为2000kW及以上），总容量不超过6000kW</th>\n      <th>6000</th>\n      <th>kW</th>\n      <th>合同签订之日起3个月内完成供货</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>包01</td>\n      <td>3.005138e+14</td>\n      <td>四川省电力公司</td>\n      <td>国网四川省电力公司天府新区供电公司</td>\n      <td>虚拟项目—国网四川电力2020年度省公司协议库存招标</td>\n      <td>无</td>\n      <td>布电线,BV,铜,4,1</td>\n      <td>布电线-型号:BV,导体材质:铜,截面mm2:4,芯数:1</td>\n      <td>米</td>\n      <td>568.00</td>\n      <td>...</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>包01</td>\n      <td>3.005138e+14</td>\n      <td>四川省电力公司</td>\n      <td>国网四川省电力公司天府新区供电公司</td>\n      <td>虚拟项目—国网四川电力2020年度省公司协议库存招标</td>\n      <td>无</td>\n      <td>布电线,BV,铜,35,1</td>\n      <td>布电线-型号:BV,导体材质:铜,截面mm2:35,芯数:1</td>\n      <td>米</td>\n      <td>4536.00</td>\n      <td>...</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>包01</td>\n      <td>3.005126e+14</td>\n      <td>四川省电力公司</td>\n      <td>国网四川映秀湾电厂</td>\n      <td>虚拟项目—国网四川电力2020年度省公司协议库存招标</td>\n      <td>无</td>\n      <td>布电线,BV,铜,4,1</td>\n      <td>布电线-型号:BV,导体材质:铜,截面mm2:4,芯数:1</td>\n      <td>米</td>\n      <td>73.68</td>\n      <td>...</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>包01</td>\n      <td>3.005145e+14</td>\n      <td>四川省电力公司</td>\n      <td>国网四川省电力公司甘孜供电公司</td>\n      <td>虚拟项目—国网四川电力2020年度省公司协议库存招标</td>\n      <td>无</td>\n      <td>布电线,BV,铜,35,1</td>\n      <td>布电线-型号:BV,导体材质:铜,截面mm2:35,芯数:1</td>\n      <td>米</td>\n      <td>30000.00</td>\n      <td>...</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>包01</td>\n      <td>3.005142e+14</td>\n      <td>四川省电力公司</td>\n      <td>国网四川省电力公司巴中供电公司</td>\n      <td>虚拟项目—国网四川电力2020年度省公司协议库存招标</td>\n      <td>无</td>\n      <td>布电线,BV,铜,4,1</td>\n      <td>布电线-型号:BV,导体材质:铜,截面mm2:4,芯数:1</td>\n      <td>米</td>\n      <td>3000.00</td>\n      <td>...</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>包1</td>\n      <td>2.600015e+14</td>\n      <td>国网新源控股有限公司</td>\n      <td>国网新源控股有限公司安徽绩溪抽水蓄能有限公司</td>\n      <td>绩溪基建项目</td>\n      <td>无</td>\n      <td>精密空调,双模,40kW,风冷,下</td>\n      <td>精密空调-组合方式:双模,额定制冷量Kw:40kW,机组冷却方式:风冷,送/回风方式:下</td>\n      <td>台</td>\n      <td>1.00</td>\n      <td>...</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>包1</td>\n      <td>2.600015e+14</td>\n      <td>国网新源控股有限公司</td>\n      <td>国网新源控股有限公司安徽绩溪抽水蓄能有限公司</td>\n      <td>绩溪基建项目</td>\n      <td>无</td>\n      <td>防静电地板</td>\n      <td>防静电地板</td>\n      <td>平方米</td>\n      <td>70.00</td>\n      <td>...</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>包1</td>\n      <td>2.600015e+14</td>\n      <td>国网新源控股有限公司</td>\n      <td>国网新源控股有限公司安徽绩溪抽水蓄能有限公司</td>\n      <td>绩溪基建项目</td>\n      <td>无</td>\n      <td>UPS电源（不间断电源）,40kVA</td>\n      <td>UPS电源（不间断电源）-额定容量:40kVA</td>\n      <td>套</td>\n      <td>1.00</td>\n      <td>...</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>包1</td>\n      <td>2.600015e+14</td>\n      <td>国网新源控股有限公司</td>\n      <td>国网新源控股有限公司安徽绩溪抽水蓄能有限公司</td>\n      <td>绩溪基建项目</td>\n      <td>无</td>\n      <td>机柜,600mm,2000mm,600mm</td>\n      <td>机柜-宽度mm:600mm,高度mm:2000mm,深度mm:600mm</td>\n      <td>套</td>\n      <td>10.00</td>\n      <td>...</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>包1</td>\n      <td>2.600015e+14</td>\n      <td>国网新源控股有限公司</td>\n      <td>国网新源控股有限公司安徽绩溪抽水蓄能有限公司</td>\n      <td>绩溪基建项目</td>\n      <td>无</td>\n      <td>机房环境监控系统</td>\n      <td>机房环境监控系统</td>\n      <td>套</td>\n      <td>1.00</td>\n      <td>...</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n      <td>NaN</td>\n    </tr>\n  </tbody>\n</table>\n<p>30950 rows × 54 columns</p>\n</div>"
     },
     "metadata": {},
     "execution_count": 24
    }
   ],
   "source": [
    "# 在上一方块中已经将每一个sheet独立读进proj_dataframe中，现在使用concat进行顺序拼接（同名列进行合并）\n",
    "total_proj_data = pd.concat(proj_dataframe)\n",
    "total_proj_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "0    正在评标\n",
       "1    正在评标\n",
       "2    正在评标\n",
       "3    正在评标\n",
       "4    正在评标\n",
       "     ... \n",
       "1    正在评标\n",
       "2    正在评标\n",
       "3    正在评标\n",
       "4    正在评标\n",
       "5    正在评标\n",
       "Name: 项目状态, Length: 30950, dtype: object"
      ]
     },
     "metadata": {},
     "execution_count": 25
    }
   ],
   "source": [
    "# 查看项目状态是否正常写入\n",
    "total_proj_data['项目状态']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "Index([                                                                                                                                                                   '包号',\n",
       "                                                                                                                                                                    '网省采购申请行号',\n",
       "                                                                                                                                                                        '项目单位',\n",
       "                                                                                                                                                                        '需求单位',\n",
       "                                                                                                                                                                        '项目名称',\n",
       "                                                                                                                                                                      '工程电压等级',\n",
       "                                                                                                                                                                        '物资名称',\n",
       "                                                                                                                                                                        '物资描述',\n",
       "                                                                                                                                                                          '单位',\n",
       "                                                                                                                                                                          '数量',\n",
       "                                                                                                                                                                        '交货日期',\n",
       "                                                                                                                                                                        '交货地点',\n",
       "                                                                                                                                                                          '备注',\n",
       "                                                                                                                                                                      '技术规范ID',\n",
       "                                                                                                                                                                        '项目状态',\n",
       "                                                                                                                                                   'ZXZB- SGS- WZ20LX- NY-001',\n",
       "                                                                                                                                                                    '储能系统成套设备',\n",
       "                                                                                                                                                                  '储能系统成套设备.1',\n",
       "                                                                                                                                                                             1,\n",
       "                                                                                                                                                              '国网福建综合能源服务有限公司',\n",
       "                                                                                                                                                              '宁德霞浦西洋岛微电网示范项目',\n",
       "                                                                                                                                                                  '储能系统成套设备.2',\n",
       "                                                                                                                      '锂电池单体电芯容量规格在260Ah以上、循环寿命6000次以上；储能变流器630kW；40尺标准规格集装箱。',\n",
       "                                                                                                                                                                             2,\n",
       "                                                                                                                                                                         'MWh',\n",
       "                                                                                                                                                              '合同签订之日起90天完成供货',\n",
       "                                                                                                                                                                      '需方指定地点',\n",
       "                                                                                                                                                                           '1',\n",
       "                                                                                                                                                             'JYY-2020XW02-01',\n",
       "                                                                                                                                                                 '院会议室音视频设备采购',\n",
       "                                 '国网经济技术研究院有限公司现有会议室为A505、601、602，共3间。今年受疫情影响，远程视频会议数量有较大增长，A601、602需新增配套移动摄像头、软视频会议主机（含音视频采集卡）等设备，与原有系统配套使用；A505会议室需更换视频矩阵、高清显示器等设备，并与原系统兼容。',\n",
       "       '1.接受代理商应答，不接受联合体；\\n2.主要设备产品需提供原厂商三年质保承诺；\\n3.主要设备产品应具有CNAS机构认证的第三方检测报告；\\n4.2017年1月1日至首次应答截止日，具有同类产品销售业绩3项及以上，并提供合同证明。（时间以合同签订日期为准，需提供用户合同封面，合同签字盖章页复印件及证明合同内容的合同页复印件）。',\n",
       "                                                                                                                                                                    '60:30:10',\n",
       "                                                                                                                                                                         '3个月',\n",
       "                                                                                                                                                                         '1.1',\n",
       "                                                                                                                                                                           0.3,\n",
       "                                                                                                           '应答文件递交截止时间：2020年7月2日8:30（北京时间）\\n发送解压密码时间：2020年7月2日8:35-8:45（北京时间）',\n",
       "                                                                                                                                                   'ZXZB- SGS- WZ20LX- NY-003',\n",
       "                                                                                                                                                                  '单晶单玻带框光伏组件',\n",
       "                                                                                                                                                               '包1-单晶单玻带框光伏组件',\n",
       "                                                                                                                                       '福建南平太阳电缆股份有限公司10MW分布式光伏发电项目配套物资采购招标项目',\n",
       "                                                                                                                                                     '335Wp及以上单晶单面发电高效电池组件，带框',\n",
       "                                                                                                                                                                  'Unnamed: 7',\n",
       "                                                                                                                                                                          'MW',\n",
       "                                                                                                                                                                         5.195,\n",
       "                                                                                                                                                                    '合同签订后30日',\n",
       "                                                                                                                                                   'ZXZB- SGS- WZ20LX- NY-002',\n",
       "                                                                                                                                                                  '风电机组及其附属设备',\n",
       "                                                                                                                                                                '风电机组及其附属设备.1',\n",
       "                                                                                                                                                                      '风电机组设备',\n",
       "                                                                                                                                           '所投机型（单机容量为2000kW及以上），总容量不超过6000kW',\n",
       "                                                                                                                                                                          6000,\n",
       "                                                                                                                                                                          'kW',\n",
       "                                                                                                                                                             '合同签订之日起3个月内完成供货'],\n",
       "      dtype='object')"
      ]
     },
     "metadata": {},
     "execution_count": 26
    }
   ],
   "source": [
    "# 查看dataframe信息\n",
    "total_proj_data.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\nInt64Index: 30950 entries, 0 to 5\nData columns (total 54 columns):\n #   Column                                                                                                                                                              Non-Null Count  Dtype  \n---  ------                                                                                                                                                              --------------  -----  \n 0   包号                                                                                                                                                                  30925 non-null  object \n 1   网省采购申请行号                                                                                                                                                            30925 non-null  float64\n 2   项目单位                                                                                                                                                                30925 non-null  object \n 3   需求单位                                                                                                                                                                30876 non-null  object \n 4   项目名称                                                                                                                                                                17978 non-null  object \n 5   工程电压等级                                                                                                                                                              30925 non-null  object \n 6   物资名称                                                                                                                                                                30925 non-null  object \n 7   物资描述                                                                                                                                                                30925 non-null  object \n 8   单位                                                                                                                                                                  30925 non-null  object \n 9   数量                                                                                                                                                                  30925 non-null  float64\n 10  交货日期                                                                                                                                                                30925 non-null  object \n 11  交货地点                                                                                                                                                                30925 non-null  object \n 12  备注                                                                                                                                                                  0 non-null      float64\n 13  技术规范ID                                                                                                                                                              30925 non-null  object \n 14  项目状态                                                                                                                                                                30950 non-null  object \n 15  ZXZB- SGS- WZ20LX- NY-001                                                                                                                                           1 non-null      object \n 16  储能系统成套设备                                                                                                                                                            1 non-null      object \n 17  储能系统成套设备.1                                                                                                                                                          1 non-null      object \n 18  1                                                                                                                                                                   6 non-null      float64\n 19  国网福建综合能源服务有限公司                                                                                                                                                      1 non-null      object \n 20  宁德霞浦西洋岛微电网示范项目                                                                                                                                                      1 non-null      object \n 21  储能系统成套设备.2                                                                                                                                                          5 non-null      object \n 22  锂电池单体电芯容量规格在260Ah以上、循环寿命6000次以上；储能变流器630kW；40尺标准规格集装箱。                                                                                                              1 non-null      object \n 23  2                                                                                                                                                                   5 non-null      object \n 24  MWh                                                                                                                                                                 5 non-null      object \n 25  合同签订之日起90天完成供货                                                                                                                                                      1 non-null      object \n 26  需方指定地点                                                                                                                                                              6 non-null      object \n 27  1                                                                                                                                                                   0 non-null      float64\n 28  JYY-2020XW02-01                                                                                                                                                     0 non-null      float64\n 29  院会议室音视频设备采购                                                                                                                                                         0 non-null      float64\n 30  国网经济技术研究院有限公司现有会议室为A505、601、602，共3间。今年受疫情影响，远程视频会议数量有较大增长，A601、602需新增配套移动摄像头、软视频会议主机（含音视频采集卡）等设备，与原有系统配套使用；A505会议室需更换视频矩阵、高清显示器等设备，并与原系统兼容。                         0 non-null      float64\n 31  1.接受代理商应答，不接受联合体；\n2.主要设备产品需提供原厂商三年质保承诺；\n3.主要设备产品应具有CNAS机构认证的第三方检测报告；\n4.2017年1月1日至首次应答截止日，具有同类产品销售业绩3项及以上，并提供合同证明。（时间以合同签订日期为准，需提供用户合同封面，合同签字盖章页复印件及证明合同内容的合同页复印件）。  0 non-null      float64\n 32  60:30:10                                                                                                                                                            0 non-null      float64\n 33  3个月                                                                                                                                                                 0 non-null      float64\n 34  1.1                                                                                                                                                                 0 non-null      float64\n 35  0.3                                                                                                                                                                 0 non-null      float64\n 36  应答文件递交截止时间：2020年7月2日8:30（北京时间）\n发送解压密码时间：2020年7月2日8:35-8:45（北京时间）                                                                                                    0 non-null      float64\n 37  ZXZB- SGS- WZ20LX- NY-003                                                                                                                                           5 non-null      object \n 38  单晶单玻带框光伏组件                                                                                                                                                          5 non-null      object \n 39  包1-单晶单玻带框光伏组件                                                                                                                                                       5 non-null      object \n 40  福建南平太阳电缆股份有限公司10MW分布式光伏发电项目配套物资采购招标项目                                                                                                                               0 non-null      float64\n 41  335Wp及以上单晶单面发电高效电池组件，带框                                                                                                                                             13 non-null     object \n 42  Unnamed: 7                                                                                                                                                          9 non-null      object \n 43  MW                                                                                                                                                                  13 non-null     object \n 44  5.195                                                                                                                                                               13 non-null     float64\n 45  合同签订后30日                                                                                                                                                            5 non-null      object \n 46  ZXZB- SGS- WZ20LX- NY-002                                                                                                                                           0 non-null      float64\n 47  风电机组及其附属设备                                                                                                                                                          0 non-null      float64\n 48  风电机组及其附属设备.1                                                                                                                                                        0 non-null      float64\n 49  风电机组设备                                                                                                                                                              4 non-null      object \n 50  所投机型（单机容量为2000kW及以上），总容量不超过6000kW                                                                                                                                   0 non-null      float64\n 51  6000                                                                                                                                                                4 non-null      object \n 52  kW                                                                                                                                                                  4 non-null      object \n 53  合同签订之日起3个月内完成供货                                                                                                                                                     0 non-null      float64\ndtypes: float64(21), object(33)\nmemory usage: 13.5+ MB\n"
     ]
    }
   ],
   "source": [
    "total_proj_data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "     包号      网省采购申请行号        项目单位                    需求单位  \\\n",
       "0   包01  3.005138e+14     四川省电力公司       国网四川省电力公司天府新区供电公司   \n",
       "1   包01  3.005138e+14     四川省电力公司       国网四川省电力公司天府新区供电公司   \n",
       "2   包01  3.005126e+14     四川省电力公司               国网四川映秀湾电厂   \n",
       "3   包01  3.005145e+14     四川省电力公司         国网四川省电力公司甘孜供电公司   \n",
       "4   包01  3.005142e+14     四川省电力公司         国网四川省电力公司巴中供电公司   \n",
       "..  ...           ...         ...                     ...   \n",
       "1    包1  2.600015e+14  国网新源控股有限公司  国网新源控股有限公司安徽绩溪抽水蓄能有限公司   \n",
       "2    包1  2.600015e+14  国网新源控股有限公司  国网新源控股有限公司安徽绩溪抽水蓄能有限公司   \n",
       "3    包1  2.600015e+14  国网新源控股有限公司  国网新源控股有限公司安徽绩溪抽水蓄能有限公司   \n",
       "4    包1  2.600015e+14  国网新源控股有限公司  国网新源控股有限公司安徽绩溪抽水蓄能有限公司   \n",
       "5    包1  2.600015e+14  国网新源控股有限公司  国网新源控股有限公司安徽绩溪抽水蓄能有限公司   \n",
       "\n",
       "                          项目名称 工程电压等级                   物资名称  \\\n",
       "0   虚拟项目—国网四川电力2020年度省公司协议库存招标      无           布电线,BV,铜,4,1   \n",
       "1   虚拟项目—国网四川电力2020年度省公司协议库存招标      无          布电线,BV,铜,35,1   \n",
       "2   虚拟项目—国网四川电力2020年度省公司协议库存招标      无           布电线,BV,铜,4,1   \n",
       "3   虚拟项目—国网四川电力2020年度省公司协议库存招标      无          布电线,BV,铜,35,1   \n",
       "4   虚拟项目—国网四川电力2020年度省公司协议库存招标      无           布电线,BV,铜,4,1   \n",
       "..                         ...    ...                    ...   \n",
       "1                       绩溪基建项目      无      精密空调,双模,40kW,风冷,下   \n",
       "2                       绩溪基建项目      无                  防静电地板   \n",
       "3                       绩溪基建项目      无     UPS电源（不间断电源）,40kVA   \n",
       "4                       绩溪基建项目      无  机柜,600mm,2000mm,600mm   \n",
       "5                       绩溪基建项目      无               机房环境监控系统   \n",
       "\n",
       "                                            物资描述   单位        数量        交货日期  \\\n",
       "0                  布电线-型号:BV,导体材质:铜,截面mm2:4,芯数:1    米    568.00  2021-05-30   \n",
       "1                 布电线-型号:BV,导体材质:铜,截面mm2:35,芯数:1    米   4536.00  2021-05-30   \n",
       "2                  布电线-型号:BV,导体材质:铜,截面mm2:4,芯数:1    米     73.68  2021-05-30   \n",
       "3                 布电线-型号:BV,导体材质:铜,截面mm2:35,芯数:1    米  30000.00  2021-05-30   \n",
       "4                  布电线-型号:BV,导体材质:铜,截面mm2:4,芯数:1    米   3000.00  2021-05-30   \n",
       "..                                           ...  ...       ...         ...   \n",
       "1   精密空调-组合方式:双模,额定制冷量Kw:40kW,机组冷却方式:风冷,送/回风方式:下    台      1.00  2020-11-30   \n",
       "2                                          防静电地板  平方米     70.00  2020-11-30   \n",
       "3                        UPS电源（不间断电源）-额定容量:40kVA    套      1.00  2020-11-30   \n",
       "4           机柜-宽度mm:600mm,高度mm:2000mm,深度mm:600mm    套     10.00  2020-11-30   \n",
       "5                                       机房环境监控系统    套      1.00  2020-11-30   \n",
       "\n",
       "          交货地点  备注                技术规范ID  项目状态  \n",
       "0   买方指定仓库车板交货 NaN  19GH-500014806-00005  正在评标  \n",
       "1   买方指定仓库车板交货 NaN  19GH-500014806-00005  正在评标  \n",
       "2   买方指定仓库车板交货 NaN  19GH-500014806-00005  正在评标  \n",
       "3   买方指定仓库车板交货 NaN  19GH-500014806-00005  正在评标  \n",
       "4   买方指定仓库车板交货 NaN  19GH-500014806-00005  正在评标  \n",
       "..         ...  ..                   ...   ...  \n",
       "1   买方指定仓库地面交货 NaN  4623-500024336-00002  正在评标  \n",
       "2   买方指定仓库地面交货 NaN  4623-500024336-00002  正在评标  \n",
       "3   买方指定仓库地面交货 NaN  4623-500024336-00002  正在评标  \n",
       "4   买方指定仓库地面交货 NaN  4623-500024336-00002  正在评标  \n",
       "5   买方指定仓库地面交货 NaN  4623-500024336-00002  正在评标  \n",
       "\n",
       "[30950 rows x 15 columns]"
      ],
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>包号</th>\n      <th>网省采购申请行号</th>\n      <th>项目单位</th>\n      <th>需求单位</th>\n      <th>项目名称</th>\n      <th>工程电压等级</th>\n      <th>物资名称</th>\n      <th>物资描述</th>\n      <th>单位</th>\n      <th>数量</th>\n      <th>交货日期</th>\n      <th>交货地点</th>\n      <th>备注</th>\n      <th>技术规范ID</th>\n      <th>项目状态</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>包01</td>\n      <td>3.005138e+14</td>\n      <td>四川省电力公司</td>\n      <td>国网四川省电力公司天府新区供电公司</td>\n      <td>虚拟项目—国网四川电力2020年度省公司协议库存招标</td>\n      <td>无</td>\n      <td>布电线,BV,铜,4,1</td>\n      <td>布电线-型号:BV,导体材质:铜,截面mm2:4,芯数:1</td>\n      <td>米</td>\n      <td>568.00</td>\n      <td>2021-05-30</td>\n      <td>买方指定仓库车板交货</td>\n      <td>NaN</td>\n      <td>19GH-500014806-00005</td>\n      <td>正在评标</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>包01</td>\n      <td>3.005138e+14</td>\n      <td>四川省电力公司</td>\n      <td>国网四川省电力公司天府新区供电公司</td>\n      <td>虚拟项目—国网四川电力2020年度省公司协议库存招标</td>\n      <td>无</td>\n      <td>布电线,BV,铜,35,1</td>\n      <td>布电线-型号:BV,导体材质:铜,截面mm2:35,芯数:1</td>\n      <td>米</td>\n      <td>4536.00</td>\n      <td>2021-05-30</td>\n      <td>买方指定仓库车板交货</td>\n      <td>NaN</td>\n      <td>19GH-500014806-00005</td>\n      <td>正在评标</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>包01</td>\n      <td>3.005126e+14</td>\n      <td>四川省电力公司</td>\n      <td>国网四川映秀湾电厂</td>\n      <td>虚拟项目—国网四川电力2020年度省公司协议库存招标</td>\n      <td>无</td>\n      <td>布电线,BV,铜,4,1</td>\n      <td>布电线-型号:BV,导体材质:铜,截面mm2:4,芯数:1</td>\n      <td>米</td>\n      <td>73.68</td>\n      <td>2021-05-30</td>\n      <td>买方指定仓库车板交货</td>\n      <td>NaN</td>\n      <td>19GH-500014806-00005</td>\n      <td>正在评标</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>包01</td>\n      <td>3.005145e+14</td>\n      <td>四川省电力公司</td>\n      <td>国网四川省电力公司甘孜供电公司</td>\n      <td>虚拟项目—国网四川电力2020年度省公司协议库存招标</td>\n      <td>无</td>\n      <td>布电线,BV,铜,35,1</td>\n      <td>布电线-型号:BV,导体材质:铜,截面mm2:35,芯数:1</td>\n      <td>米</td>\n      <td>30000.00</td>\n      <td>2021-05-30</td>\n      <td>买方指定仓库车板交货</td>\n      <td>NaN</td>\n      <td>19GH-500014806-00005</td>\n      <td>正在评标</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>包01</td>\n      <td>3.005142e+14</td>\n      <td>四川省电力公司</td>\n      <td>国网四川省电力公司巴中供电公司</td>\n      <td>虚拟项目—国网四川电力2020年度省公司协议库存招标</td>\n      <td>无</td>\n      <td>布电线,BV,铜,4,1</td>\n      <td>布电线-型号:BV,导体材质:铜,截面mm2:4,芯数:1</td>\n      <td>米</td>\n      <td>3000.00</td>\n      <td>2021-05-30</td>\n      <td>买方指定仓库车板交货</td>\n      <td>NaN</td>\n      <td>19GH-500014806-00005</td>\n      <td>正在评标</td>\n    </tr>\n    <tr>\n      <th>...</th>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>包1</td>\n      <td>2.600015e+14</td>\n      <td>国网新源控股有限公司</td>\n      <td>国网新源控股有限公司安徽绩溪抽水蓄能有限公司</td>\n      <td>绩溪基建项目</td>\n      <td>无</td>\n      <td>精密空调,双模,40kW,风冷,下</td>\n      <td>精密空调-组合方式:双模,额定制冷量Kw:40kW,机组冷却方式:风冷,送/回风方式:下</td>\n      <td>台</td>\n      <td>1.00</td>\n      <td>2020-11-30</td>\n      <td>买方指定仓库地面交货</td>\n      <td>NaN</td>\n      <td>4623-500024336-00002</td>\n      <td>正在评标</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>包1</td>\n      <td>2.600015e+14</td>\n      <td>国网新源控股有限公司</td>\n      <td>国网新源控股有限公司安徽绩溪抽水蓄能有限公司</td>\n      <td>绩溪基建项目</td>\n      <td>无</td>\n      <td>防静电地板</td>\n      <td>防静电地板</td>\n      <td>平方米</td>\n      <td>70.00</td>\n      <td>2020-11-30</td>\n      <td>买方指定仓库地面交货</td>\n      <td>NaN</td>\n      <td>4623-500024336-00002</td>\n      <td>正在评标</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>包1</td>\n      <td>2.600015e+14</td>\n      <td>国网新源控股有限公司</td>\n      <td>国网新源控股有限公司安徽绩溪抽水蓄能有限公司</td>\n      <td>绩溪基建项目</td>\n      <td>无</td>\n      <td>UPS电源（不间断电源）,40kVA</td>\n      <td>UPS电源（不间断电源）-额定容量:40kVA</td>\n      <td>套</td>\n      <td>1.00</td>\n      <td>2020-11-30</td>\n      <td>买方指定仓库地面交货</td>\n      <td>NaN</td>\n      <td>4623-500024336-00002</td>\n      <td>正在评标</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>包1</td>\n      <td>2.600015e+14</td>\n      <td>国网新源控股有限公司</td>\n      <td>国网新源控股有限公司安徽绩溪抽水蓄能有限公司</td>\n      <td>绩溪基建项目</td>\n      <td>无</td>\n      <td>机柜,600mm,2000mm,600mm</td>\n      <td>机柜-宽度mm:600mm,高度mm:2000mm,深度mm:600mm</td>\n      <td>套</td>\n      <td>10.00</td>\n      <td>2020-11-30</td>\n      <td>买方指定仓库地面交货</td>\n      <td>NaN</td>\n      <td>4623-500024336-00002</td>\n      <td>正在评标</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>包1</td>\n      <td>2.600015e+14</td>\n      <td>国网新源控股有限公司</td>\n      <td>国网新源控股有限公司安徽绩溪抽水蓄能有限公司</td>\n      <td>绩溪基建项目</td>\n      <td>无</td>\n      <td>机房环境监控系统</td>\n      <td>机房环境监控系统</td>\n      <td>套</td>\n      <td>1.00</td>\n      <td>2020-11-30</td>\n      <td>买方指定仓库地面交货</td>\n      <td>NaN</td>\n      <td>4623-500024336-00002</td>\n      <td>正在评标</td>\n    </tr>\n  </tbody>\n</table>\n<p>30950 rows × 15 columns</p>\n</div>"
     },
     "metadata": {},
     "execution_count": 29
    }
   ],
   "source": [
    "# 根据需要只需要以下列：\n",
    "col = ['包号', '网省采购申请行号', '项目单位', '需求单位', '项目名称', '工程电压等级', '物资名称', '物资描述','单位', '数量', '交货日期', '交货地点', '备注', '技术规范ID', '项目状态']\n",
    "# 提取所需数据\n",
    "final_data = total_proj_data[col]\n",
    "final_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\nInt64Index: 30950 entries, 0 to 5\nData columns (total 15 columns):\n #   Column    Non-Null Count  Dtype  \n---  ------    --------------  -----  \n 0   包号        30925 non-null  object \n 1   网省采购申请行号  30925 non-null  float64\n 2   项目单位      30925 non-null  object \n 3   需求单位      30876 non-null  object \n 4   项目名称      17978 non-null  object \n 5   工程电压等级    30925 non-null  object \n 6   物资名称      30925 non-null  object \n 7   物资描述      30925 non-null  object \n 8   单位        30925 non-null  object \n 9   数量        30925 non-null  float64\n 10  交货日期      30925 non-null  object \n 11  交货地点      30925 non-null  object \n 12  备注        0 non-null      float64\n 13  技术规范ID    30925 non-null  object \n 14  项目状态      30950 non-null  object \ndtypes: float64(3), object(12)\nmemory usage: 4.3+ MB\n"
     ]
    }
   ],
   "source": [
    "final_data.info()"
   ]
  },
  {
   "source": [
    "## 数据清洗（除去空值）"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "final_data = final_data.dropna(subset=['包号'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\nInt64Index: 30925 entries, 0 to 5\nData columns (total 15 columns):\n #   Column    Non-Null Count  Dtype  \n---  ------    --------------  -----  \n 0   包号        30925 non-null  object \n 1   网省采购申请行号  30925 non-null  float64\n 2   项目单位      30925 non-null  object \n 3   需求单位      30876 non-null  object \n 4   项目名称      17978 non-null  object \n 5   工程电压等级    30925 non-null  object \n 6   物资名称      30925 non-null  object \n 7   物资描述      30925 non-null  object \n 8   单位        30925 non-null  object \n 9   数量        30925 non-null  float64\n 10  交货日期      30925 non-null  object \n 11  交货地点      30925 non-null  object \n 12  备注        0 non-null      float64\n 13  技术规范ID    30925 non-null  object \n 14  项目状态      30925 non-null  object \ndtypes: float64(3), object(12)\nmemory usage: 3.8+ MB\n"
     ]
    }
   ],
   "source": [
    "final_data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 保存csv文件\n",
    "# final_data.to_csv('project-test.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 保存为数据库文件\n",
    "datasql = create_engine('mysql+pymysql://root:180116@localhost/projecttest').connect()\n",
    "final_data.to_sql(name='proj_demodata', con=datasql)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ]
}